In this part, we will analyze copy number signatures across cancer types and show the landscape.
Signature number and contribution in each cancer type
Load tidy cancer type annotation data.
library(sigminer)
library(tidyverse)
# Cancer type annotation; merged with the activity tables below on the
# "sample" column.
pcawg_types <- readRDS("../data/pcawg_type_info.rds")
To better describe the copy number signature landscape, here we use the refitted activity data obtained from the bootstrap procedure (see section “Reliable signature activity attribution” in PART 1).
# Refitted signature activity; the elements used below are `abs_activity`,
# `rel_activity` and `similarity`.
pcawg_activity <- readRDS("../data/pcawg_cn_sigs_CN176_activity.rds")
Combine the cancer type annotation and activity data, keeping only samples with good reconstruction (cosine similarity ≥ 0.75).
# Retain samples whose reconstruction reaches a cosine similarity of at
# least 0.75.
keep_samps <- pcawg_activity$similarity >= 0.75
# Annotate the absolute and relative activities of the retained samples with
# the cancer type table, matching on the shared "sample" column.
df_abs <- merge(
  x = pcawg_activity$abs_activity[keep_samps],
  y = pcawg_types,
  by = "sample"
)
df_rel <- merge(
  x = pcawg_activity$rel_activity[keep_samps],
  y = pcawg_types,
  by = "sample"
)
Signature activity in each cancer type
Here we draw the distribution of a signature's activity across cancer types.
# Distribution of the absolute activity of Sig1 across cancer types.
show_group_distribution(
  df_abs,
  dvar = "Sig1", gvar = "cancer_type",
  point_size = 0.3, g_angle = 90,
  order_by_fun = FALSE
)
We have many signatures here, so we output them to PDF files.
# Write one PDF per signature and per activity type (absolute and relative)
# into "../output/cancer-type-dist".
out_dir <- "../output/cancer-type-dist"
# recursive = TRUE also creates "../output" when it does not exist yet;
# without it the directory creation fails silently and ggsave() errors later.
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
signames <- paste0("Sig", 1:11)
# The list names become the file name prefixes ("Absolute_activity_...",
# "Relative_activity_...").
activity_sets <- list(Absolute = df_abs, Relative = df_rel)
for (i in signames) {
  for (kind in names(activity_sets)) {
    pxx <- show_group_distribution(
      activity_sets[[kind]],
      gvar = "cancer_type",
      dvar = i,
      order_by_fun = FALSE,
      ylab = i,
      g_angle = 90,
      point_size = 0.3
    )
    ggplot2::ggsave(
      file.path(out_dir, paste0(kind, "_activity_", i, ".pdf")),
      plot = pxx, width = 12, height = 6
    )
  }
}
# Drop the temporaries introduced for the export.
rm(pxx, activity_sets, out_dir, kind)
Signature landscape
We define a signature as detectable in a sample if it contributes >5% of the exposure in that sample.
# Long table of detectability flags: a signature is flagged 1L in a sample
# when its relative activity exceeds 0.05, otherwise 0L.
# across() replaces the superseded mutate_at().
df <- df_rel %>%
  dplyr::mutate(
    dplyr::across(dplyr::starts_with("Sig"), ~ ifelse(.x > 0.05, 1L, 0L))
  ) %>%
  tidyr::pivot_longer(
    cols = dplyr::starts_with("Sig"),
    names_to = "sig", values_to = "detectable"
  )
# Long table of the relative activities themselves.
df2 <- df_rel %>%
  tidyr::pivot_longer(
    cols = dplyr::starts_with("Sig"),
    names_to = "sig", values_to = "expo"
  )
# Put the flag and the activity of each sample/signature pair side by side.
df <- dplyr::left_join(df, df2,
  by = c("sample", "cancer_type", "sig")
)
# Per cancer type and signature: number of samples with the signature,
# median activity among those samples, group size and an axis label.
df_type <- df %>%
  dplyr::group_by(cancer_type, sig) %>%
  dplyr::summarise(
    freq = sum(detectable), # directly use count
    # NA when no sample in the group has the signature
    expo = median(expo[detectable == 1]),
    n = n(),
    label = paste0(unique(cancer_type), " (n=", n, ")"),
    .groups = "drop"
  )
# Lookup from cancer type to its "<type> (n=...)" label.
mps <- unique(df_type[, c("cancer_type", "label")])
mpss <- mps$label
names(mpss) <- mps$cancer_type
summary(df_type$freq)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 6.00 15.00 30.42 36.00 247.00
Show copy number signature landscape.
library(cowplot)
# Bubble plot with one point per cancer type / signature pair: point size
# encodes `freq` and point color encodes `expo` from `df_type`.
p <- ggplot(
  data = df_type,
  mapping = aes(x = cancer_type, y = factor(sig, levels = paste0("Sig", 1:11)))
) +
  geom_point(mapping = aes(size = freq, color = expo))
# Theme first, then the x axis text rotation applied on top of it.
p <- p +
  theme_cowplot() +
  ggpubr::rotate_x_text(60)
# Axis labels come from the cancer type -> label lookup; size and color
# scales use fixed breaks.
p <- p +
  scale_x_discrete(breaks = mps$cancer_type, labels = mps$label) +
  scale_size_continuous(
    limits = c(5, 250),
    breaks = c(5, 20, 50, 100, 200)
  ) +
  scale_color_stepsn(
    colors = viridis::viridis(5, direction = -1),
    breaks = c(0, 0.25, 0.5, 0.75, 1)
  )
p <- p +
  labs(
    x = NULL,
    y = "Copy number signatures",
    color = "Median activity\ndue to signature",
    size = "Tumors with\nthe signature"
  )
p
ggsave(
  filename = "../output/CNS_PCAWG_landscape.pdf",
  plot = p,
  width = 12, height = 6
)
Cancer type associated enrichment
Run enrichment analysis.
# Test each signature's absolute activity for enrichment per cancer type,
# using wilcox.test as the comparison method.
enrich_result <- group_enrichment(
  df_abs,
  enrich_vars = paste0("Sig", seq_len(11)),
  grp_vars = "cancer_type",
  co_method = "wilcox.test"
)
Show enrichment landscape.
# Order the signatures Sig1..Sig11 instead of alphabetically.
enrich_result$enrich_var <- factor(
  enrich_result$enrich_var,
  levels = paste0("Sig", 1:11)
)
# return_list = TRUE returns a list of plots; the element named after the
# grouping variable ("cancer_type") is used below.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
p <- show_group_enrichment(enrich_result, fill_by_p_value = TRUE, return_list = TRUE)
p <- p$cancer_type + labs(x = NULL, y = NULL)
p
ggsave("../output/CNS_PCAWG_enrichment_landscape.pdf",
  plot = p,
  height = 8, width = 6
)
To better visualize the enrichment results, we use binned color regions.
# Same enrichment plot, with cut_p_value = TRUE so the fill uses binned
# p-value regions.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
p <- show_group_enrichment(
  enrich_result,
  fill_by_p_value = TRUE,
  cut_p_value = TRUE,
  return_list = TRUE
)
p <- p$cancer_type + labs(x = NULL, y = NULL)
p
ggsave("../output/CNS_PCAWG_enrichment_landscape2.pdf",
  plot = p,
  height = 8, width = 6
)
We see cancer type SoftTissue-Liposarc has pretty high enrichment on Sig6. Let’s check the enrichment result.
# Rows of the enrichment result where the tested group is SoftTissue-Liposarc
# (one row per signature).
enrich_result[grp1 == "SoftTissue-Liposarc"]
grp_var enrich_var grp1 grp2 grp1_size grp1_pos_measure
1: cancer_type Sig1 SoftTissue-Liposarc Rest 19 107.747229
2: cancer_type Sig2 SoftTissue-Liposarc Rest 19 45.210722
3: cancer_type Sig3 SoftTissue-Liposarc Rest 19 17.841799
4: cancer_type Sig4 SoftTissue-Liposarc Rest 19 53.524895
5: cancer_type Sig5 SoftTissue-Liposarc Rest 19 9.332606
grp2_size grp2_pos_measure measure_observed measure_tested p_value
1: 2621 17.690767 6.0905910 NA 2.618595e-02
2: 2621 14.791334 3.0565682 NA 5.269566e-01
3: 2621 15.222509 1.1720669 NA 7.519819e-02
4: 2621 13.483545 3.9696455 NA 1.051345e-03
5: 2621 14.027543 0.6653058 NA 2.968415e-02
type method
1: continuous wilcox.test
2: continuous wilcox.test
3: continuous wilcox.test
4: continuous wilcox.test
5: continuous wilcox.test
[ reached getOption("max.print") -- omitted 6 rows ]
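The Sig6 row is among those truncated from the printed output above. For Sig6, we can see a mean activity of 486 in SoftTissue-Liposarc (n=19) vs 9 in the remaining samples (n=2621).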
Let’s go further and plot the distribution for the two groups.
# Two-group table: SoftTissue-Liposarc versus every other cancer type
# (relabelled "Others"), together with the Sig6 activity.
df_check <- df_abs[, .(
  cancer_type = ifelse(
    cancer_type == "SoftTissue-Liposarc",
    "SoftTissue-Liposarc",
    "Others"
  ),
  Sig6 = Sig6
)]
# Box plot of Sig6 activity for the two groups.
ggpubr::ggboxplot(
  df_check,
  x = "cancer_type",
  y = "Sig6",
  fill = "cancer_type",
  width = 0.3,
  xlab = FALSE
)
Check copy number distribution for the
"SoftTissue-Liposarc" samples.
# Sample IDs of the SoftTissue-Liposarc tumors.
samples <- df_abs[cancer_type == "SoftTissue-Liposarc", sample]
# Copy number segments of those samples, taken from the `data` slot of the
# stored copy number object.
pcawg_cn_obj <- readRDS("../data/pcawg_cn_obj.rds")
cn_dt <- subset(pcawg_cn_obj@data, sample %in% samples)
# Segment length, counting both end points.
cn_dt$segLen <- with(cn_dt, end - start + 1)
Copy number value:
# Distribution of segment copy number values in the selected samples.
boxplot(cn_dt$segVal)
Segment length:
# Distribution of segment lengths in the selected samples.
boxplot(cn_dt[["segLen"]])
# Per sample, count the segments whose copy number value is above 2, then
# show how that count is distributed across samples.
cn_dt_samp <- cn_dt[, list(nAMP = sum(segVal > 2)), by = "sample"]
boxplot(cn_dt_samp[["nAMP"]])